In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np

In [2]:
df_raw = pd.read_csv("../../data/toxicity_annotations/raw/toxicity_for_ellery.csv")

In [3]:
df = df_raw.copy()

In [4]:
df.shape


Out[4]:
(1671721, 28)

Clean Annotations


In [5]:
df['query'].value_counts()


Out[5]:
user_blocked       504821
user_random        504800
article_blocked    331055
article_random     331045
Name: query, dtype: int64

In [6]:
df['ns'] = df['query'].apply(lambda x: x.split('_')[0])
df['sample'] = df['query'].apply(lambda x: x.split('_')[1])

Make random and blocked samples disjoint


In [7]:
df.index = df.rev_id
df.sample_count = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()
print(df.sample_count.value_counts())
# just set them all to random
df['sample'][df.sample_count == 2] = 'random'
del df.sample_count
print(df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts())


1    166415
2       215
Name: rev_id, dtype: int64
/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
1    166630
Name: rev_id, dtype: int64

Binarize toxicity


In [8]:
df['toxicity'] = (df['toxicity_score'] < 0).apply(int)

In [9]:
df['toxicity_score'].value_counts(dropna=False)


Out[9]:
 0.0    812717
 1.0    572471
-1.0    200512
-2.0     45825
 2.0     23123
NaN      17073
Name: toxicity_score, dtype: int64

In [10]:
df['toxicity'].value_counts(dropna=False)


Out[10]:
0    1425384
1     246337
Name: toxicity, dtype: int64

Remove answers to test questions


In [11]:
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])


# annotations:  1671721

Remove annotations where revision could not be read


In [12]:
from baselines import remove_na
# remove all annotations for a revisions where more than 50% of annotators for that revision could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])


# annotations:  1657460

In [13]:
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])


# annotations:  1651459

Make sure that each rev was only annotated by the same worker once


In [14]:
df.groupby(['rev_id', '_worker_id']).size().value_counts()


Out[14]:
1    1651443
2          8
dtype: int64

In [15]:
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])


# annotations:  1651451

Filter out annotations for revisions with duplicated diff content


In [16]:
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])


165208

In [17]:
u_comments = comments.drop_duplicates(subset = ['comment_text'])
print(u_comments.shape[0])


160588

In [18]:
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])


# annotations:  1605360

Check that labels are not None


In [19]:
df['toxicity_score'].value_counts(dropna=False)


Out[19]:
 0.0    792657
 1.0    558147
-1.0    190434
-2.0     42075
 2.0     22047
Name: toxicity_score, dtype: int64

In [20]:
df['toxicity'].value_counts(dropna=False)


Out[20]:
0    1372851
1     232509
Name: toxicity, dtype: int64

Remove annotations from all revisions that were annotated less than 8 times


In [21]:
counts = df['rev_id'].value_counts().to_frame()
counts.columns = ['n']
counts['rev_id'] = counts.index

In [22]:
counts['n'].value_counts().head()


Out[22]:
10    154088
9       2635
11      2063
8        665
7        336
Name: n, dtype: int64

In [23]:
counts_enough = counts.query("n>=8")

In [24]:
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])


# annotations:  1601838

Get set of labeled comments


In [25]:
df_comments = df.drop_duplicates(subset = ['rev_id']).copy()
df_comments['logged_in'] = df_comments['user_id'].notnull()
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).apply(lambda x: x.year)

Add Splits


In [26]:
elements = np.array(["train", "dev", "test"])
probabilities = np.array([0.6, 0.2, 0.2])
df_comments['split'] = np.random.choice(elements, size=df_comments.shape[0], p=list(probabilities))

In [27]:
df_comments['split'].value_counts()


Out[27]:
train    96334
dev      31869
test     31854
Name: split, dtype: int64

rename workers


In [28]:
df_workers = df[['_worker_id']].drop_duplicates()
df_workers['anon_id'] = range(df_workers.shape[0])
df = df.merge(df_workers, how = 'inner', on = '_worker_id')
df.shape

# save worker id mapping
df_workers.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations_worker_id_map.tsv'), sep = '\t', index = False)

In [169]:
# fix legacy special token issues

df_comments['diff'] = df_comments['diff'].apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('TAB', 'TAB_TOKEN'))
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('NEWLINE', 'NEWLINE_TOKEN'))
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('"', '`'))

# apply latest version of clean and filter
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean and filter drops some comments, so drop associated labels
df = df.merge(df_comments[['rev_id']], how = 'inner', on = 'rev_id' )

In [170]:
# rename some columns
df_comments = df_comments.rename(columns={
                        'clean_diff': 'comment',
                        'rev_timestamp': 'timestamp',
        
                       })
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
df_comments = df_comments[order]
df_comments = df_comments.sort_values('rev_id')
df_comments.shape


Out[170]:
(159686, 7)

In [171]:
# get set of human labels

df_toxicity_labels = df[['rev_id', 'anon_id', 'toxicity', 'toxicity_score']]

df_toxicity_labels = df_toxicity_labels.rename(columns={
                        'anon_id': 'worker_id',
                       })

df_toxicity_labels = df_toxicity_labels.sort_values('rev_id')

In [172]:
# save dfs
df_comments.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t', index = False)
df_toxicity_labels.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t', index = False)

In [173]:
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t').shape


Out[173]:
(159686, 7)

In [174]:
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t').drop_duplicates(subset = 'rev_id').shape


Out[174]:
(159686, 4)

In [176]:
df_comments.head()


Out[176]:
rev_id comment year logged_in ns sample split
1315373 2232.0 This:NEWLINE_TOKEN:One can make an analogy in ... 2002 True article random train
223073 4216.0 `NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ... 2002 True user random train
480113 8953.0 Elected or Electoral? JHK 2002 False article random test
1099396 26547.0 `This is such a fun entry. DevotchkaNEWLINE_... 2002 True article random train
941623 28959.0 Please relate the ozone hole to increases in c... 2002 True article random test

In [ ]: